# Reference: "Text Mining with R" (book) - https://www.tidytextmining.com/
install.packages("janeaustenr")
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/janeaustenr_0.1.5.tgz'
Content type 'application/x-gzip' length 1620949 bytes (1.5 MB)
==================================================
downloaded 1.5 MB
The downloaded binary packages are in
/var/folders/_4/6fl_4mzn4nq4f3jpfzqydbz0cql0q5/T//RtmppVA9aE/downloaded_packages
library(janeaustenr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(stringr)
# tidytext vignette by Julia Silge and David Robinson:
# https://cran.r-project.org/web/packages/tidytext/vignettes/tidytext.html
install.packages("tidytext")
also installing the dependencies ‘SnowballC’, ‘ISOcodes’, ‘hunspell’, ‘tokenizers’, ‘stopwords’
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/SnowballC_0.5.1.tgz'
Content type 'application/x-gzip' length 3182388 bytes (3.0 MB)
==================================================
downloaded 3.0 MB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/ISOcodes_2017.09.27.tgz'
Content type 'application/x-gzip' length 300128 bytes (293 KB)
==================================================
downloaded 293 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/hunspell_2.9.tgz'
Content type 'application/x-gzip' length 2107672 bytes (2.0 MB)
==================================================
downloaded 2.0 MB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/tokenizers_0.1.4.tgz'
Content type 'application/x-gzip' length 264190 bytes (257 KB)
==================================================
downloaded 257 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/stopwords_0.9.0.tgz'
Content type 'application/x-gzip' length 132364 bytes (129 KB)
==================================================
downloaded 129 KB
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/tidytext_0.1.6.tgz'
Content type 'application/x-gzip' length 2787473 bytes (2.7 MB)
==================================================
downloaded 2.7 MB
The downloaded binary packages are in
/var/folders/_4/6fl_4mzn4nq4f3jpfzqydbz0cql0q5/T//RtmppVA9aE/downloaded_packages
# tidytext is installed above but never attached; unnest_tokens() (used here)
# and get_sentiments() / stop_words (used below) come from it.
library(tidytext)

# `original_books` is not defined anywhere else in this script. This
# definition follows the tidytext vignette: each line of every Austen novel
# is annotated with its running line number and chapter (a running count of
# lines that start with "chapter <number>"), both computed per book. The
# `linenumber` and `chapter` columns are what the analyses below rely on.
original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(
    linenumber = row_number(),
    chapter = cumsum(str_detect(
      text,
      regex("^chapter [\\divxlc]", ignore_case = TRUE)
    ))
  ) %>%
  ungroup()

# Tidy format: split the `text` column into one token per row, stored in a
# new column named `word`.
tidy_books <- original_books %>%
  unnest_tokens(word, text)
tidy_books
# Load the stop-word table into the workspace, then drop every token that
# appears in it. No `by` is given, so the join uses the columns the two
# tables have in common.
data("stop_words")
cleaned_books <- anti_join(tidy_books, stop_words)
Joining, by = "word"
# Keep only the NRC lexicon entries whose sentiment label is "joy".
nrcjoy <- filter(get_sentiments("nrc"), sentiment == "joy")
# Joy words in "Emma": restrict to that book, keep only rows that have a
# match in `nrcjoy`, then tally per word with the most frequent first.
count(
  semi_join(filter(tidy_books, book == "Emma"), nrcjoy),
  word,
  sort = TRUE
)
Joining, by = "word"
library(tidyr)
library(tidyr)
# Bing lexicon: words paired with a sentiment label.
bing <- get_sentiments("bing")

# Net sentiment per 80-line section of each book.
# The intermediate tables live only inside local(), so nothing besides
# `janeaustensentiment` is added to the workspace.
janeaustensentiment <- local({
  # Keep only tokens that appear in the lexicon (implicit join key).
  scored_words <- inner_join(tidy_books, bing)
  # `index` numbers the 80-line sections (integer division of linenumber).
  section_counts <- count(
    scored_words,
    book,
    index = linenumber %/% 80,
    sentiment
  )
  # One column per sentiment label; absent combinations become 0.
  wide_counts <- spread(section_counts, sentiment, n, fill = 0)
  # Net score for the section.
  mutate(wide_counts, sentiment = positive - negative)
})
Joining, by = "word"
library(ggplot2)

# Plot sentiment scores across the plot trajectory of each novel:
# one bar per section, one panel per book, each panel with its own x range.
ggplot(
  data = janeaustensentiment,
  mapping = aes(x = index, y = sentiment, fill = book)
) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")
# Which words contribute to each sentiment: match tokens against the Bing
# lexicon and tally every (word, sentiment) pair, most frequent first.
bing_word_counts <- ungroup(
  count(inner_join(tidy_books, bing), word, sentiment, sort = TRUE)
)
Joining, by = "word"
# Print the tally.
bing_word_counts

# Bar chart of the words counted more than 150 times. Counts of negative
# words are negated so they extend below zero, and `word` is reordered by
# the signed count (mutate() evaluates its arguments in order, so reorder()
# sees the already-signed `n`).
ggplot(
  mutate(
    filter(bing_word_counts, n > 150),
    n = ifelse(sentiment == "negative", -n, n),
    word = reorder(word, n)
  ),
  aes(word, n, fill = sentiment)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("contribution to sentiment")
install.packages("wordcloud")
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.4/wordcloud_2.5.tgz'
Content type 'application/x-gzip' length 143945 bytes (140 KB)
==================================================
downloaded 140 KB
The downloaded binary packages are in
/var/folders/_4/6fl_4mzn4nq4f3jpfzqydbz0cql0q5/T//RtmppVA9aE/downloaded_packages
library(wordcloud)
Loading required package: RColorBrewer
# Word cloud of the stop-word-free corpus, capped at 75 words.
# with() evaluates the call using the `word` and `n` columns of the tally.
with(
  count(cleaned_books, word),
  wordcloud(word, n, max.words = 75)
)
library(reshape2)
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
# Comparison cloud of the sentiment-bearing words: tally each
# (word, sentiment) pair, cast to a word-by-sentiment matrix of counts
# (missing pairs filled with 0), and draw at most 75 words.
comparison.cloud(
  acast(
    count(inner_join(tidy_books, bing), word, sentiment, sort = TRUE),
    word ~ sentiment,
    value.var = "n",
    fill = 0
  ),
  colors = c("#F8766D", "#00BFC4"),
  max.words = 75
)
Joining, by = "word"
# `PandP_sentences` is not defined anywhere else in this script. Following
# the tidytext vignette, build it by tokenizing the text of Pride and
# Prejudice (`prideprejudice`, from janeaustenr) into one sentence per row.
PandP_sentences <- unnest_tokens(
  data.frame(text = prideprejudice, stringsAsFactors = FALSE),
  sentence,
  text,
  token = "sentences"
)
# Inspect the second sentence.
PandP_sentences$sentence[2]
[1] "however little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."
# Bing lexicon entries labelled "negative".
bingnegative <- filter(get_sentiments("bing"), sentiment == "negative")
# Total number of tokens in each chapter of each book; this is the
# denominator for the negative-word ratio computed next.
wordcounts <- summarize(
  group_by(tidy_books, book, chapter),
  words = n()
)
# Share of negative words per chapter, reduced to the top row by ratio.
# Intermediates stay inside local() so no extra objects reach the workspace.
local({
  # Tokens that appear among the negative lexicon entries.
  negative_tokens <- semi_join(tidy_books, bingnegative)
  # Negative-token count per book and chapter.
  negative_by_chapter <- summarize(
    group_by(negative_tokens, book, chapter),
    negativewords = n()
  )
  # Attach each chapter's total word count and compute the ratio.
  with_ratio <- mutate(
    left_join(negative_by_chapter, wordcounts, by = c("book", "chapter")),
    ratio = negativewords / words
  )
  # Exclude chapter 0. top_n() is given no weight column, so it ranks by
  # the last column, `ratio`.
  top_n(filter(with_ratio, chapter != 0), 1)
})
Joining, by = "word"
Selecting by ratio